
Wellcome Centre for Anti-Infectives Research
School of Life Sciences, University of Dundee
#reload when modified
%load_ext autoreload
%autoreload 2
#set up code
import sys
sys.path.append('../../ProLib/')
from ProteomicsUtility import utilities as PTUT
import ProtRank
import warnings
warnings.filterwarnings("ignore")
#define helphttp://localhost:8888/notebooks/calvin/new_data/analysis_427_2018.ipynb#ing function
import os
from tqdm import tqdm_notebook
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from adjustText import adjust_text
from matplotlib.lines import Line2D
from Bio import SeqIO
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
#E:\old_ege_nucleus\nuc_txt\txt
RAW_FILE_PATH =os.path.join('')
PREFIX = 'combined'
#PREFIX = 'combined_add_esag_unique'
TXT_PATH=os.path.join(RAW_FILE_PATH, PREFIX, 'txt')
OUT_FOLDER = os.path.join(TXT_PATH, 'results')
import os
if not os.path.exists(OUT_FOLDER):
os.makedirs(OUT_FOLDER)
print('created out folder')
else:
print('out folder exist')
#pd.DataFrame.sort_values??
#read data and log transform for plots
df = pd.read_csv(os.path.join(TXT_PATH,'proteinGroups.txt'),sep='\t')
df = PTUT.clean_df(df, score=5, unique_pep_threshold=1)
df = PTUT.mod_df(df)
print(df.shape)
df['pep_stats']=[int(n.split(';')[0]) for n in df['Peptide counts (all)']]
df['pep_stats'].describe()
desc_927= dict(zip(df['Gene_id'],df['desc']))
list(df.columns)[0:10]
list(df.columns)
wt_headers = [
'Reporter intensity corrected 7',
'Reporter intensity corrected 8',
'Reporter intensity corrected 9',]
plus_headers = [
'Reporter intensity corrected 4',
'Reporter intensity corrected 5',
'Reporter intensity corrected 6']
minus_headers =[
'Reporter intensity corrected 1',
'Reporter intensity corrected 2',
'Reporter intensity corrected 3']
#list(df.columns)
cols = wt_headers+plus_headers+minus_headers
selection = df[cols]
selection.head()
print(selection.shape)
selection = selection[(selection.T != 0).any()]
print(selection.shape)
selection.describe()
new_dm_columns = ['WT'+str(n+1) for n in range(3)]
new_plus_columns= ['Plus'+str(n+1) for n in range(3)]
new_minus_columns= ['Minus'+str(n+1) for n in range(3)]
selection.columns = new_dm_columns+new_plus_columns+new_minus_columns
temp = pd.concat([df[['Gene_id']], selection],axis=1)
temp = temp[(temp.iloc[:,1:].T != 0).any()]
temp.to_csv(os.path.join(OUT_FOLDER,'indata_prank.txt'),sep='\t',index=False)
import missingno as msno
#visualization of missing data
ax=msno.bar(selection.replace(0,np.nan),figsize=(6, 6))
plt.title('Missing Data Analysis', size=12)
ax.set_ylabel('Fraction of data points',size=12)
plt.savefig(os.path.join(OUT_FOLDER,'missing_value_bar.png'))
plt.tight_layout()
plt.show()
#print(data.shape)
msno.matrix(selection.replace(0,np.nan), figsize=(6, 6))
#plt.title('Missing Data')
plt.savefig(os.path.join(OUT_FOLDER,'missing_value_matrix.png'))
plt.show()
#pal = sns.color_palette()
palette=['r','r','r','g','g','g','b','b','b']
palette_g = ['r','g','b']
color_dictionary = {'r':'WT','g':'Plus','b':'Minus'}
np.log10(selection).head()
np.log10(selection+1).plot(kind='kde', color=palette, alpha=0.5)
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('Value Distribution')
plt.xlabel('Log10 TMT')
plt.savefig(os.path.join(OUT_FOLDER,'value_distribution_kde.png'))
plt.show()
sns.boxplot(data =np.log10(selection+1),showfliers=False,palette=palette)
plt.title('Value Distribution')
plt.xlabel('Log10 LFQ')
plt.savefig(os.path.join(OUT_FOLDER,'value_distribution_box.png'))
plt.show()
import seaborn as sns
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(np.log1p(selection).dropna().corr(),
annot=True, linewidths=.5, ax=ax,cmap="Blues")
plt.savefig(os.path.join(OUT_FOLDER,'corr_heatmap.png'))
plt.show()
remove any zeros
#utilities.plot_correlation(.sample(500))
#fig,ax=plt.subplots(figsize=(8,8))
g = sns.pairplot(np.log1p(selection).dropna().sample(500))
plt.savefig(os.path.join(OUT_FOLDER,'corr_pairplot.png'))
plt.show()
con_columns = ['WT'+str(n+1) for n in range(3)]
ind_columns= ['Plus'+str(n+1) for n in range(3)]
ind_r_columns = ['Minus'+str(n+1) for n in range(3)]
import seaborn as sns
cv = selection.copy()
cv = cv.dropna()
cv['mean_c'] = cv[con_columns].mean(axis=1)
cv['std_c'] = cv[con_columns].std(axis=1)
cv['mean_i'] = cv[ind_columns].mean(axis=1)
cv['std_i'] = cv[ind_columns].std(axis=1)
cv['mean_ir'] = cv[ind_columns].mean(axis=1)
cv['std_ir'] = cv[ind_columns].std(axis=1)
cv['cv_DM'] = cv['std_c']/cv['mean_c']
cv['cv_Plus'] = cv['std_i']/cv['mean_i']
cv['cv_Minus'] = cv['std_ir']/cv['mean_ir']
fig,ax=plt.subplots(figsize=(8,6))
sns.boxplot(data=cv[['cv_DM','cv_Plus','cv_Minus']],palette=palette_g,
showfliers=False,ax=ax)
#plt.savefig(os.path.join('Fig_S3A_cv.png'))
plt.savefig(os.path.join(OUT_FOLDER,'cv.png'))
plt.show()
cv.plot(x='mean_c',y='cv_DM',kind='scatter',marker='.')
plt.xscale('log')
plt.show()
cv.plot(x='mean_i',y='cv_Minus',kind='scatter',marker='.')
plt.xscale('log')
plt.show()
cv.plot(x='mean_ir',y='cv_Plus',kind='scatter',marker='.')
plt.xscale('log')
plt.show()
color_dictionary
fig, ax = plt.subplots()
ax = PTUT.make_mds(selection, palette, ax, top=500,
color_dictionary=color_dictionary)
plt.savefig(os.path.join(OUT_FOLDER,'mds.png'))
fig, ax = plt.subplots()
ax=PTUT.make_pca(selection, palette, ax, top=500,
color_dictionary=color_dictionary)
plt.savefig(os.path.join(OUT_FOLDER,'pca.png'))
selection.describe()
#PTUT??
normed_df = PTUT.norm_loading_TMT(selection)
normed_df.head()
fig, ax = plt.subplots()
ax = PTUT.make_mds(normed_df, palette, ax, top=500,
color_dictionary=color_dictionary)
plt.savefig(os.path.join(OUT_FOLDER,'mds.png'))
sns.boxplot(data =np.log10(normed_df),showfliers=False,palette=palette)
plt.title('Value Distribution')
plt.xlabel('Log10 LFQ')
plt.savefig(os.path.join(OUT_FOLDER,'value_distribution_box_normed.png'))
plt.show()
np.log10(normed_df).describe()
normed_df.columns
OUT_FOLDER
temp = df[['Gene_id','desc']].join(
np.log10(normed_df),how='right').replace(-np.inf,np.nan)
temp.to_csv(
os.path.join(OUT_FOLDER, 'indata_PolySTest.csv'),index=False)
def plot_bar(ingene='Tb427_000014300.1-p1',log=False,title='ESAG7:Tb427_000014300'):
temp = pd.read_csv(os.path.join(TXT_PATH,'proteinGroups.txt'),sep='\t')
fig,ax=plt.subplots()
s= temp[temp['Protein IDs'].str.contains(ingene)][cols]
s.columns = con_columns+ind_columns+ind_r_columns
if log:
np.log10(s).plot(kind='bar',color=palette,ax=ax)
else:
s.plot(kind='bar',color=palette,ax=ax)
ax.legend(title='Groups',loc='center left', bbox_to_anchor=(1, 0.9))
plt.title(title)
plt.savefig(os.path.join(OUT_FOLDER,ingene+'.png'))
plt.show()
#if ingene.replace('.1-p1','') in mapping_927_dict:
#for n in mapping_927_dict[ingene.replace('.1-p1','')]:
#print(n)
print (desc_927[ingene])
print(list(temp[temp['Protein IDs'].str.contains(ingene)]['Protein IDs']))
print('unique peptides:',
list(temp[temp['Protein IDs'].str.contains(ingene)]['Unique peptides']))
list(temp[temp['Protein IDs'].str.contains(ingene)]['Peptide counts (all)'])
plot_bar(ingene='Tb927.5.4450',log=False,title='Tb927.5.4450')
import ProtRank
data =ProtRank.load_data(os.path.join(OUT_FOLDER,'indata_prank.txt'),
separator = '\t',
ignore_cols = [], index_col = 0, comments = '#')
data.head()
#ProtRank.save_results??
OUT_FOLDER
what_to_compare = [[['WT1', 'Plus1'], ['WT2', 'Plus2'], ['WT3', 'Plus3']]]
ProtRank.data_stats(data, what_to_compare = what_to_compare, ignore_missed = True)
description = 'wt_vs_plus_sample_dataset'
significant_proteins = ProtRank.rank_proteins(
data, what_to_compare, description, path_to=OUT_FOLDER)
what_to_compare = [[['Plus1', 'Minus1'], ['Plus2', 'Minus2'], ['Plus3', 'Minus3']]]
description = 'plus_vs_minus_sample_dataset'
significant_proteins_r = ProtRank.rank_proteins(
data, what_to_compare, description, path_to=OUT_FOLDER)
common = set(significant_proteins) & set(significant_proteins_r)
to_viz = [n for n in significant_proteins if n in common]
print(len(to_viz))
what_to_compare = [[['WT1', 'Minus1'], ['WT2', 'Minus2'], ['WT3', 'Minus3']]]
description = 'wt_vs_minus_sample_dataset'
significant_proteins_r = ProtRank.rank_proteins(
data, what_to_compare, description, path_to=OUT_FOLDER)
def parse_prank_out(in_folder = OUT_FOLDER, infile='prs-con1_vs_ind1_sample_dataset.dat',suffix = 'a'):
df = pd.read_csv(os.path.join(in_folder, infile),
sep='\t', comment='#', index_col=[1],
names=['id','rank','FDR','sign'])
df.columns = [n+'_'+suffix for n in df.columns]
df['log_FDR_'+suffix]=-np.log10(df['FDR'+'_'+suffix])
df['log_rank_'+suffix]=np.log10(df['rank'+'_'+suffix])
df['srank_'+suffix]=[n*1 if a=='+' else n*-1 for n,a in
zip(df['rank'+'_'+suffix],df['sign'+'_'+suffix])]
return df
df_1 = parse_prank_out(infile='prs-plus_vs_minus_sample_dataset.dat',suffix = 'a')
df_2 = parse_prank_out(infile='prs-wt_vs_plus_sample_dataset.dat',suffix = 'b')
df_3 = parse_prank_out(infile='prs-wt_vs_minus_sample_dataset.dat',suffix = 'c')
print(df_1.shape,df_2.shape,df_3.shape)
merge = df_1.join(df_2).join(df_3)
print(merge.shape)
merge.sort_values('srank_a').head()
data.head()
data.columns
print(merge.shape)
merge = merge.join(data)
print(merge.shape)
merge['log2FC_a'] = np.log2(merge[['Plus1', 'Plus2', 'Plus3']].mean(axis=1)
/merge[['Minus1', 'Minus2', 'Minus3']].mean(axis=1))
merge['log2FC_b'] = np.log2(merge[['WT1', 'WT2', 'WT3']].mean(axis=1)
/ merge[['Plus1', 'Plus2', 'Plus3']].mean(axis=1))
merge['log2FC_c'] = np.log2(merge[['WT1', 'WT2', 'WT3']].mean(axis=1)
/ merge[['Minus1', 'Minus2', 'Minus3']].mean(axis=1))
merge['log10intensity'] = np.log10(merge[data.columns].mean(axis=1))
merge['desc']=[desc_927[n] for n in merge.index.values]
df.head()
desc_dict=dict(zip(df['Gene_id'],df['desc']))
desc_dict['Tb07.11L3.100']
merge.reset_index().merge(
df[['Protein IDs','Fasta headers','Gene_id']],
left_on='index',right_on='Gene_id').to_csv(os.path.join(OUT_FOLDER,'out.csv'))
temp = merge.reset_index()
temp = temp.rename({
'index':'Gene_id',
'srank_a':'Signed_Rank',
'log10intensity':'log10_TMT_intensity',
'FDR_a':'FDR',
'desc':'Desc'},axis=1)
temp['Gene_acc']=temp.index.values
temp[['Gene_acc','Gene_id','Signed_Rank','log10_TMT_intensity','FDR','Desc']].to_csv(
'indata_P_M.csv',index=False)
temp[['Gene_acc','Gene_id','Plus1','Plus2','Plus3',
'Minus1','Minus2','Minus3','Desc']].to_csv('indata2_P_M.csv',index=False)
temp = merge.reset_index()
temp = temp.rename({
'index':'Gene_id',
'srank_b':'Signed_Rank',
'log10intensity':'log10_TMT_intensity',
'FDR_b':'FDR',
'desc':'Desc'},axis=1)
temp['Gene_acc']=temp.index.values
temp[['Gene_acc','Gene_id','Signed_Rank','log10_TMT_intensity','FDR','Desc']].to_csv(
'indata_D_P.csv',index=False)
temp[['Gene_acc','Gene_id','WT1','WT2','WT3',
'Plus1','Plus2','Plus3','Desc']].to_csv('indata2_D_P.csv',index=False)
temp = merge.reset_index()
temp = temp.rename({
'index':'Gene_id',
'srank_c':'Signed_Rank',
'log10intensity':'log10_TMT_intensity',
'FDR_c':'FDR',
'desc':'Desc'},axis=1)
temp['Gene_acc']=temp.index.values
temp[['Gene_acc','Gene_id','Signed_Rank','log10_TMT_intensity','FDR','Desc']].to_csv(
'indata_D_M.csv',index=False)
temp[['Gene_acc','Gene_id','WT1','WT2','WT3',
'Minus1','Minus2','Minus3','Desc']].to_csv('indata2_D_M.csv',index=False)
temp = merge.reset_index()
temp = temp.rename({
'index':'Gene_id',
'log2FC_a':'Signed_Rank_Plus_vs_Minus',
'log2FC_b':'Signed_Rank_DM_vs_Plus',
'FDR_a':'FDR_Plus_vs_Minus',
'FDR_b':'FDR_DM_vs_Plus',
'desc':'Desc'},axis=1)
temp['Gene_acc']=temp.index.values
temp[['Gene_acc','Gene_id','Signed_Rank_DM_vs_Plus',
'FDR_DM_vs_Plus','Signed_Rank_Plus_vs_Minus',
'FDR_Plus_vs_Minus','Desc']].to_csv('indata_all.csv',index=False)
temp[['Gene_acc','Gene_id','WT1','WT2','WT3',
'Plus1','Plus2','Plus3',
'Minus1','Minus2','Minus3','Desc']].to_csv('indata2_all.csv',index=False)
fig,axes=plt.subplots(figsize=(12,6), ncols=2, nrows=1)
selection = merge[merge['FDR_a']<0.01]
selection_index = selection.index.values
selection_names = [desc_dict.get(prot_id,'none') for prot_id in selection_index]
PTUT.make_vulcano(merge, axes[0], x='log2FC_a', y='FDR_a',
annot_index=selection_index,
annot_names = selection_names,
title='Volcano',
label_for_selection='Regulated',
point_size_all=5,alpha_main=0.2,
add_text=False,point_size_selection=8)
PTUT.make_vulcano(merge, axes[1], x='log10intensity', y='log2FC_a',
annot_index=selection_index,
annot_names = selection_names,
title='MA',
label_for_selection='Regulated',
point_size_all=5,alpha_main=0.2,
point_size_selection=8,add_text=False)
axes[1].get_legend().remove()
axes[0].get_legend().remove()
axes[0].invert_yaxis()
plt.suptitle('Plus_vs_Minus')
plt.show()
fig,axes=plt.subplots(figsize=(12,6), ncols=2, nrows=1)
selection = merge[merge['FDR_b']<0.01]
selection_index = selection.index.values
selection_names = [desc_dict.get(prot_id,'none') for prot_id in selection_index]
PTUT.make_vulcano(merge, axes[0], x='log2FC_b', y='FDR_b',
annot_index=selection_index,
annot_names = selection_names,
title='Volcano',
label_for_selection='Regulated',
point_size_all=5,alpha_main=0.2,
add_text=False,point_size_selection=8)
PTUT.make_vulcano(merge, axes[1], x='log10intensity', y='log2FC_b',
annot_index=selection_index,
annot_names = selection_names,
title='MA',
label_for_selection='Regulated',
point_size_all=5,alpha_main=0.2,
point_size_selection=8,add_text=False)
axes[1].get_legend().remove()
axes[0].get_legend().remove()
axes[0].invert_yaxis()
plt.suptitle('WT_vs_Plus')
plt.show()
fig,axes=plt.subplots(figsize=(12,6), ncols=2, nrows=1)
selection = merge[merge['FDR_c']<0.01]
selection_index = selection.index.values
selection_names = [desc_dict.get(prot_id,'none') for prot_id in selection_index]
PTUT.make_vulcano(merge, axes[0], x='log2FC_c', y='FDR_c',
annot_index=selection_index,
annot_names = selection_names,
title='Volcano',
label_for_selection='Regulated',
point_size_all=5,alpha_main=0.2,
add_text=False,point_size_selection=8)
PTUT.make_vulcano(merge, axes[1], x='log10intensity', y='log2FC_c',
annot_index=selection_index,
annot_names = selection_names,
title='MA',
label_for_selection='Regulated',
point_size_all=5,alpha_main=0.2,
point_size_selection=8,add_text=False)
axes[1].get_legend().remove()
axes[0].get_legend().remove()
axes[0].invert_yaxis()
plt.suptitle('WT_vs_Plus')
common_regulated = merge[
#(merge['srank_b']>0)&(merge['srank_a']>0)&
(merge['FDR_b']<0.01)&(merge['FDR_a']<0.01)
]
common_regulated.shape
merge['regulated']=['yes' if n in common_regulated.index.values else 'no' for n in merge.index.values]
common_regulated[['srank_a','FDR_a','srank_b','FDR_b','desc']]
#for item in list(desc_dict.keys()):
# print(item.split('.')[0])
plt.style.use('ggplot')
fig,ax=plt.subplots(figsize=(12,8))
merge.plot(kind='scatter', x='log2FC_a', y='log2FC_b',
ax=ax,marker='.',alpha=0.2,label='MS Identified')
merge.loc[
common_regulated.index.values
].plot(kind='scatter',x='log2FC_a',y='log2FC_b',ax=ax,c='r',label='Significant')
ax.set_xlabel('DOWN <-- Plus/Minus --> UP',fontsize=12)
ax.set_ylabel('Down <-- WT/Plus --> UP',fontsize=12)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5),title= 'T. brucei proteins')
leg = ax.get_legend()
leg.legendHandles[0].set_color('Blue')
leg.legendHandles[0].set_alpha(1)
#none}
texts = [ax.text(merge.loc[i]['log2FC_a'],
merge.loc[i]['log2FC_b'],
desc_dict.get(i,'none'))
for i in common_regulated.index.values]
adjust_text(texts, arrowprops=dict(arrowstyle='->',
color='red'),ax=ax)
plt.title('Common')
plt.tight_layout()
plt.savefig(os.path.join(OUT_FOLDER,'Fig1.png'))
plt.savefig(os.path.join(OUT_FOLDER,'Fig1.svg'))
plt.show()
#'Tb927.11.13140',
look_for = ['Tb927.11.17000','Tb927.9.6090','Tb927.8.7700']
#for item in list(desc_dict.keys()):
# print(item.split('.')[0])
plt.style.use('ggplot')
fig,ax=plt.subplots(figsize=(12,8))
merge.plot(kind='scatter', x='log2FC_a', y='log2FC_b',
ax=ax,marker='.',alpha=0.2,label='MS Identified')
merge.loc[
look_for
].plot(kind='scatter',x='log2FC_a',y='log2FC_b',ax=ax,c='r',label='Significant')
ax.set_xlabel('DOWN <-- Plus/Minus --> UP',fontsize=12)
ax.set_ylabel('Down <-- DM/Plus --> UP',fontsize=12)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5),title= 'T. brucei proteins')
leg = ax.get_legend()
leg.legendHandles[0].set_color('Blue')
leg.legendHandles[0].set_alpha(1)
#none}
texts = [ax.text(merge.loc[i]['log2FC_a'],
merge.loc[i]['log2FC_b'],
desc_dict.get(i,'none'))
for i in look_for]
adjust_text(texts, arrowprops=dict(arrowstyle='->',
color='red'),ax=ax)
plt.title('Common')
plt.tight_layout()
plt.savefig(os.path.join(OUT_FOLDER,'Fig2.png'))
plt.savefig(os.path.join(OUT_FOLDER,'Fig2.svg'))
plt.show()
look_for = ['Tb927.11.17000','Tb927.9.6090','Tb927.8.7700']
#for item in list(desc_dict.keys()):
# print(item.split('.')[0])
plt.style.use('ggplot')
fig,ax=plt.subplots(figsize=(12,8))
merge.plot(kind='scatter', x='FDR_a', y='FDR_b',
ax=ax,marker='.',alpha=0.2,label='MS Identified')
merge.loc[
look_for
].plot(kind='scatter',x='FDR_a',y='FDR_b',ax=ax,c='r',label='Significant')
ax.set_xlabel('FDR_a',fontsize=12)
ax.set_ylabel('FDR_b',fontsize=12)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5),title= 'T. brucei proteins')
leg = ax.get_legend()
leg.legendHandles[0].set_color('Blue')
leg.legendHandles[0].set_alpha(1)
#none}
texts = [ax.text(merge.loc[i]['FDR_a'],
merge.loc[i]['FDR_b'],
desc_dict.get(i,'none'))
for i in look_for]
adjust_text(texts, arrowprops=dict(arrowstyle='->',
color='red'),ax=ax)
plt.title('Common')
plt.tight_layout()
plt.savefig(os.path.join(OUT_FOLDER,'Fig4.png'))
plt.savefig(os.path.join(OUT_FOLDER,'Fig4.svg'))
plt.show()
import plotly.express as px
merge.columns
import plotly.express as px
df = px.data.iris() # iris is a pandas DataFrame
fig = px.scatter(merge, x="log2FC_a", y="log2FC_b",
color="regulated",hover_name=merge.index.values,
hover_data=['FDR_a','FDR_b','desc'])
fig.write_html('scatter.html', auto_open=False)
fig.show()
for prot in list(common_regulated.index.values):
plot_bar(ingene=prot,log=False,title=prot+' '+desc_dict.get(prot,'none'))
!jupyter nbconvert --to html_toc analysis.ipynb